# (1) Load demographic data
# The Census API key is read from an environment variable so the secret is
# never hard-coded in the script.
credential <- Sys.getenv("census_api_key")
# Pull the 2020 ACS 5-year variable catalog (cached on disk) so the variable
# IDs used below can be checked against their labels and concepts.
lookup_var <- load_variables(2020, "acs5", cache = TRUE)
glimpse(lookup_var)
## Rows: 27,850
## Columns: 3
## $ name <chr> "B01001_001", "B01001_002", "B01001_003", "B01001_004", "B0100~
## $ label <chr> "Estimate!!Total:", "Estimate!!Total:!!Male:", "Estimate!!Tota~
## $ concept <chr> "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX BY AGE", "SEX B~
# Named ACS variable vectors; the names become tidy column names after
# pivot_wider() downstream.
race <- c(
  white = "B02001_002",
  black = "B02001_003",
  indian_alaska = "B02001_004",
  asian = "B02001_005",
  pacific = "B02001_006"
)
education_attainment <- c(
  no_schooling = "B15003_002",
  elementary_school = "B15003_009",
  middle_school = "B15003_012",
  high_school = "B15003_017",
  bachelor = "B15003_022"
)
employment_status <- c(
  employed = "B23025_004",
  unemployed = "B23025_005",
  # FIX: was "23025_007" — the "B" table prefix was missing, so get_acs()
  # could not match the variable.
  no_in_labor_force = "B23025_007"
)
sex_by_age <- c(
  total_by_age = "B01001_001",
  male_by_age_total = "B01001_002",
  female_by_age_total = "B01001_026"
)
# Tract-level 2020 ACS 5-year estimates for the five NYC counties.
demogdata <- get_acs(
  geography = "tract",
  year = 2020,
  variables = c(race,
                education_attainment,
                employment_status,
                sex_by_age,
                hhincome = "B19013_001", # median household income
                poverty = "B17020_002"), # poverty level
  key = credential,
  state = "36", # New York
  # FIX: FIPS codes are strings — numeric literals silently drop the leading
  # zeros (005 becomes 5). tidycensus re-pads numerics, but quoting is the
  # unambiguous form. Bronx, Kings, New York, Queens, Richmond.
  county = c("005", "047", "061", "081", "085"))
# (2) Tidy Demographic Data
demo_clean <- demogdata %>%
  select(-moe) %>%
  # Lower-case every column name and replace spaces with underscores in one
  # pass. FIX: rename_all() is superseded; rename_with() is the current verb.
  rename_with(~ str_replace_all(str_to_lower(.x), " ", "_")) %>%
  # NAME is "tract, county, state"; keep the tract and county pieces only.
  separate(name, c("census_tract", "county", "state"), sep = ", ") %>%
  select(-state) %>%
  # One row per tract, one column per ACS variable.
  pivot_wider(names_from = variable,
              values_from = estimate)
# (3) Missing Data Imputation
# Impute missing household income with the county-level mean.
mean_impute <- demo_clean %>%
  select(geoid, county, hhincome) %>%
  group_by(county) %>%
  # mean(na.rm = TRUE) replaces the filter(!is.na(...)) + mean() two-step.
  summarise(hhincome = mean(hhincome, na.rm = TRUE))
mean_impute
## # A tibble: 5 x 2
## county hhincome
## <chr> <dbl>
## 1 Bronx County 47615.
## 2 Kings County 71143.
## 3 New York County 104136.
## 4 Queens County 77123.
## 5 Richmond County 85720.
demo_clean <- demo_clean %>%
  left_join(mean_impute, by = "county") %>%
  # coalesce() keeps the observed value and falls back to the county mean.
  mutate(hhincome = coalesce(hhincome.x, hhincome.y)) %>%
  select(-hhincome.y, -hhincome.x)
view(demo_clean)
# (4) Mutate more variables
# Derived tract-level measures, written as one mutate() instead of repeated
# base-R `demo_clean$...` assignments.
demo_clean <- demo_clean %>%
  mutate(
    # Share of non-white residents; 0/0 yields NaN, which is reported as 0
    # (is.na() is TRUE for NaN).
    rate_of_minorities = round(
      (black + asian + indian_alaska + pacific) /
        (white + black + asian + indian_alaska + pacific) * 100, 1),
    rate_of_minorities = replace(rate_of_minorities, is.na(rate_of_minorities), 0),
    # Employed share of the labor force; NaN (0/0) -> 0. NOTE(review): the
    # original used is.nan() here but is.na() above — that asymmetry is
    # preserved (a true NA employment count stays NA here).
    employment_rate = round(employed / (employed + unemployed) * 100, 1),
    employment_rate = replace(employment_rate, is.nan(employment_rate), 0)
  )
# (5) Add Geometry and Convert to sf Object
geo <- tracts(
  state = "36",
  # FIX: quote FIPS codes so leading zeros survive (numeric 005 becomes 5;
  # tigris re-pads numerics, but strings are the unambiguous form).
  county = c("005", "047", "061", "081", "085"),
  cb = TRUE # generalized cartographic-boundary shapes (smaller download)
) %>%
  st_transform(crs = 4326) %>%
  select(GEOID, geometry)
##
|
| | 0%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|=== | 5%
|
|==== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 19%
|
|============== | 20%
|
|============== | 21%
|
|================ | 23%
|
|================= | 24%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 28%
|
|===================== | 30%
|
|====================== | 31%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|============================ | 40%
|
|============================= | 41%
|
|============================== | 43%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================= | 47%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 62%
|
|============================================= | 64%
|
|============================================== | 65%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 86%
|
|============================================================= | 88%
|
|============================================================== | 89%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|======================================================================| 100%
# Join tract geometry onto the tidy demographics and keep everything in WGS84.
# FIX: geoid stays character — coercing GEOID/FIPS codes to numeric drops
# leading zeros in the general case, and the later
# `geoid %in% floodtracts$GEOID` test compares against a character column.
demography_sf <- left_join(
  x = demo_clean,
  y = geo,
  by = c("geoid" = "GEOID")) %>%
  st_as_sf() %>%
  st_transform(crs = 4326)
# 2050s 500-year floodplain polygons (NYC open data shapefile), reprojected
# to WGS84 to match the tract geometry.
floodplain500 <- st_read("data/Sea Level Rise Maps (2050s 500-year Floodplain)/geo_export_e0ab4151-ea79-43e0-b860-b256c9a861bf.shp") %>%
st_transform(crs = 4326)
## Reading layer `geo_export_e0ab4151-ea79-43e0-b860-b256c9a861bf' from data source `C:\Users\zheng\Desktop\670 DS\FinalProject\data\Sea Level Rise Maps (2050s 500-year Floodplain)\geo_export_e0ab4151-ea79-43e0-b860-b256c9a861bf.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 7055 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -74.26444 ymin: 40.49321 xmax: -73.71676 ymax: 40.92189
## Geodetic CRS: WGS84(DD)
# Join the floodplain onto the census-tract map and flag each tract by
# whether it falls inside the 500-year floodplain.
# Planar geometry (presumably to avoid s2 errors on these polygons — confirm).
sf_use_s2(FALSE)
# Inner spatial join: keep only tracts intersecting the floodplain.
floodtracts <- st_join(geo, floodplain500, join = st_intersects, left = FALSE)
flood_demography_sf <- demography_sf %>%
  mutate(
    in_floodplain = geoid %in% floodtracts$GEOID,
    flood = ifelse(in_floodplain, 1, 0)
  ) %>%
  select(-in_floodplain) %>%
  na.omit() %>%
  mutate(flood = factor(flood))
# Split into training and testing groups, stratified on the flood flag.
# Both sf and plain data-frame versions are kept: the sf objects are needed
# for mapping (EDA and assessment), the data frames for modeling.
set.seed(20220422)
flood_split <- initial_split(flood_demography_sf, prop = 0.7, strata = "flood")
flood_train_sf <- training(flood_split)
flood_test_sf <- testing(flood_split)
# Model frames: replace the geometry with centroid coordinates and drop IDs.
# FIX: sf point coordinates are (x, y) = (longitude, latitude); element 1 is
# the LONGITUDE and element 2 the LATITUDE — the original labels were swapped.
flood_train_df <- flood_train_sf %>%
  st_centroid() %>%
  mutate(lon = unlist(map(geometry, 1)), lat = unlist(map(geometry, 2))) %>%
  na.omit() %>%
  as_tibble() %>%
  select(-c(geometry, geoid, census_tract, county))
# FIX: the test frame must drop the same ID columns as the training frame;
# the original `select(-geometry, geoid, census_tract, county)` kept the
# three ID columns instead of removing them.
flood_test_df <- flood_test_sf %>%
  st_centroid() %>%
  mutate(lon = unlist(map(geometry, 1)), lat = unlist(map(geometry, 2))) %>%
  na.omit() %>%
  as_tibble() %>%
  select(-c(geometry, geoid, census_tract, county))
#Visualize the relationship between demographic data and flooding
#Descriptive Analysis
# Quick distributional check of every training-set column; base summary() on
# the sf frame also reports the geometry column and the flood class balance.
flood_train_sf %>%
summary()
## geoid census_tract county total_by_age
## Min. :3.601e+10 Length:1628 Length:1628 Min. : 0
## 1st Qu.:3.605e+10 Class :character Class :character 1st Qu.: 2216
## Median :3.606e+10 Mode :character Mode :character Median : 3402
## Mean :3.606e+10 Mean : 3598
## 3rd Qu.:3.608e+10 3rd Qu.: 4669
## Max. :3.609e+10 Max. :16600
## male_by_age_total female_by_age_total white black
## Min. : 0 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.:1069 1st Qu.:1135 1st Qu.: 408 1st Qu.: 53.0
## Median :1603 Median :1754 Median : 1124 Median : 264.5
## Mean :1719 Mean :1879 Mean : 1487 Mean : 855.9
## 3rd Qu.:2250 3rd Qu.:2454 3rd Qu.: 2103 3rd Qu.:1426.0
## Max. :7280 Max. :9320 Max. :11625 Max. :7248.0
## indian_alaska asian pacific no_schooling
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 61.0 1st Qu.: 0.00 1st Qu.: 17.0
## Median : 0.00 Median : 233.5 Median : 0.00 Median : 50.5
## Mean : 14.88 Mean : 519.0 Mean : 2.32 Mean : 77.2
## 3rd Qu.: 11.00 3rd Qu.: 679.8 3rd Qu.: 0.00 3rd Qu.:109.0
## Max. :971.00 Max. :5687.0 Max. :405.00 Max. :785.0
## elementary_school middle_school high_school bachelor
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 259.5 1st Qu.: 246.0
## Median : 4.00 Median : 22.00 Median : 449.5 Median : 428.5
## Mean : 18.67 Mean : 41.48 Mean : 505.3 Mean : 574.9
## 3rd Qu.: 25.00 3rd Qu.: 56.00 3rd Qu.: 696.0 3rd Qu.: 752.2
## Max. :417.00 Max. :798.00 Max. :2685.0 Max. :5903.0
## poverty employed unemployed hhincome
## Min. : 0.0 Min. : 0 Min. : 0.0 Min. : 2499
## 1st Qu.: 181.8 1st Qu.:1020 1st Qu.: 45.0 1st Qu.: 50856
## Median : 401.5 Median :1592 Median : 92.0 Median : 70978
## Mean : 617.1 Mean :1731 Mean : 122.5 Mean : 74456
## 3rd Qu.: 816.5 3rd Qu.:2222 3rd Qu.: 168.0 3rd Qu.: 90242
## Max. :4350.0 Max. :9758 Max. :1129.0 Max. :250001
## rate_of_minorities employment_rate geometry flood
## Min. : 0.00 Min. : 0.00 MULTIPOLYGON :1628 0:1054
## 1st Qu.: 20.80 1st Qu.: 90.80 epsg:4326 : 0 1: 574
## Median : 46.60 Median : 94.10 +proj=long...: 0
## Mean : 48.52 Mean : 89.68
## 3rd Qu.: 77.55 3rd Qu.: 96.40
## Max. :100.00 Max. :100.00
# Relationship Between Rate of Employment, Rate of Minorities and Poverty
# Correlation Test among white, male_by_age_total, hhincome, employment_rate
#P2 <- flood_train_df %>%
# select(white, male_by_age_total, hhincome, employment_rate) %>%
# na.omit()
#P2 <- round(cor(P2, method = "spearman"), 1)
#NOTE(review): this conclusion refers to poverty and no-schooling, but the commented-out correlation test above uses white, male_by_age_total, hhincome and employment_rate — re-run the intended test or update this note. Original claim: reject, at the 95% level, the null hypothesis that the incidence of poverty and the count of residents with no schooling are uncorrelated; the Spearman correlation was a positive 0.44.
#Flooding Data
# Interactive choropleth of median household income per tract, with
# floodplain tracts overlaid in translucent black.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, hhincome, sep = ": "))
P3 <- ggplot() +
  geom_sf_interactive(data = flood_train_sf_plot, color = "white",
                      aes(fill = hhincome,
                          tooltip = tooltip, data_id = hhincome), size = 0.2) +
  geom_sf(data = filter(flood_train_sf, flood == 1), fill = "black", color = "white", alpha = 0.2) +
  scale_fill_distiller(palette = "Spectral", direction = 1, labels = label_dollar()) +
  labs(title = "Median Household Income in New York City",
       subtitle = "Flooding Impact by Household Income",
       # FIX: the data pulled above is the 2020 vintage (2016-2020 5-year
       # ACS), not 2019.
       caption = "Data source: 2016-2020 5-year ACS, US Census Bureau",
       fill = "Household Income") +
  theme_void()
girafe(ggobj = P3) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Interactive choropleth of the minority share per tract, floodplain tracts
# overlaid in translucent red.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, rate_of_minorities, sep = ": "))
P4 <- ggplot() +
  geom_sf_interactive(data = flood_train_sf_plot, color = "white",
                      aes(fill = rate_of_minorities,
                          tooltip = tooltip, data_id = rate_of_minorities), size = 0.2) +
  geom_sf(data = filter(flood_train_sf, flood == 1), fill = "red", color = "white", alpha = 0.2, show.legend = "point", lwd = 2) +
  scale_fill_viridis_c() +
  # scale_fill_distiller(palette = "Blues", direction = 1) +
  labs(title = "Rate of Minorities in New York City",
       subtitle = "Flooding Impact by Rate of Minorities",
       # FIX: the data is the 2020 vintage (2016-2020 5-year ACS), not 2019.
       caption = "Data source: 2016-2020 5-year ACS, US Census Bureau",
       fill = "Rate of Minorities") +
  theme_void()
girafe(ggobj = P4) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Interactive choropleth of the employment rate per tract, floodplain tracts
# overlaid in translucent red.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, employment_rate, sep = ": "))
P5 <- ggplot() +
  geom_sf_interactive(data = flood_train_sf_plot, color = "white",
                      aes(fill = employment_rate,
                          tooltip = tooltip, data_id = employment_rate), size = 0.2) +
  geom_sf(data = filter(flood_train_sf, flood == 1), fill = "red", color = "white", alpha = 0.2) +
  scale_fill_distiller(palette = "Blues", direction = 1) +
  labs(title = "Rate of Employment in New York City",
       subtitle = "Flooding Impact by Rate of Employment",
       # FIX: the data is the 2020 vintage (2016-2020 5-year ACS), not 2019.
       caption = "Data source: 2016-2020 5-year ACS, US Census Bureau",
       fill = "Rate of Employment") +
  theme_void()
girafe(ggobj = P5) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Interactive choropleth of poverty counts per tract, floodplain tracts
# overlaid in translucent red.
flood_train_sf_plot <- flood_train_sf %>%
  group_by(geoid, county, census_tract) %>%
  mutate(tooltip = paste(county, census_tract, poverty, sep = ": "))
P6 <- ggplot() +
  geom_sf_interactive(data = flood_train_sf_plot, color = "white",
                      aes(fill = poverty,
                          tooltip = tooltip, data_id = poverty), size = 0.2) +
  geom_sf(data = filter(flood_train_sf, flood == 1), fill = "red", color = "white", alpha = 0.2) +
  labs(title = " Poverty in New York City",
       subtitle = "Flooding Impact by Incidences of Poverty",
       # FIX: the data is the 2020 vintage (2016-2020 5-year ACS), not 2019.
       caption = "Data source: 2016-2020 5-year ACS, US Census Bureau",
       fill = "Poverty Levels") +
  theme_void()
girafe(ggobj = P6) %>%
  girafe_options(opts_hover(css = "fill:orange;"),
                 opts_zoom(max = 10))
# Shared preprocessing recipe: flood is the outcome, everything else a
# predictor, centered and scaled.
flood_rec <-
recipe(flood ~ ., data = flood_train_df) %>%
# center and scale all predictors
step_center(all_predictors()) %>%
step_scale(all_predictors())
# 10-fold cross-validation on the training frame, shared by all three
# classifiers so their per-fold metrics are directly comparable.
flood_folds <- vfold_cv(flood_train_df, v = 10, repeats = 1)
# Decision Tree Model
# rpart classification tree; only tree_depth is tuned, over a 10-level
# regular grid.
cart_mod <- decision_tree(tree_depth = tune()) %>%
set_engine(engine = "rpart") %>%
set_mode(mode = "classification")
cart_grid <- grid_regular(tree_depth(), levels = 10)
cart_wf <- workflow() %>%
add_recipe(flood_rec) %>%
add_model(cart_mod)
# Cross-validated tuning, scored by ROC AUC only.
cart_cv <- cart_wf %>%
tune_grid(
resamples = flood_folds,
grid = cart_grid,
metrics = metric_set(roc_auc)
)
# Logistic Model
# run an initial logistic regression to get data for variable importance
floodlogit <- glm(flood ~., data = flood_train_df, family = "binomial")
# displays importance of 10 most important variables
# (caret::varImp() on a glm; top_n() has no `wt`, so it ranks by the last
# column, Overall — superseded verb, kept as-is for version-stable output.)
importances <- varImp(floodlogit)
importances %>%
arrange(desc(Overall)) %>%
top_n(10)
## Overall
## employed 6.385857
## employment_rate 5.805587
## bachelor 4.560309
## unemployed 3.768257
## asian 3.305981
## total_by_age 3.136806
## lon 2.961183
## hhincome 1.727637
## lat 1.636568
## poverty 1.284834
# According to https://stackoverflow.com/questions/47822694/logistic-regression-tuning-parameter-grid-in-r-caret-package/48218280#48218280, there is no tuning parameter for glm model,
# so the workflow is resampled (not tuned).
logistic_mod <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
logistic_wf <- workflow() %>%
  add_model(logistic_mod) %>%
  add_recipe(flood_rec)
# FIX: the argument is `resamples`; the original `resample` only worked
# through R's partial argument matching.
logistic_cv <- logistic_wf %>%
  fit_resamples(
    resamples = flood_folds,
    metrics = metric_set(roc_auc)
  )
# KNN Model
# kknn classifier; the neighbor count is tuned over a 10-level regular grid.
flood_knn_mod <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine(engine = "kknn") %>%
  set_mode(mode = "classification")
knn_grid <- grid_regular(neighbors(), levels = 10)
flood_knn_wf <- workflow() %>%
  add_recipe(flood_rec) %>%
  add_model(flood_knn_mod)
# FIX: spell out `resamples` (was `resample`, relying on partial matching).
flood_knn_cv <- flood_knn_wf %>%
  tune_grid(
    resamples = flood_folds,
    grid = knn_grid,
    metrics = metric_set(roc_auc)
  )
# Collect per-fold ROC AUC from each cross-validated model. Metrics are
# gathered unsummarized and averaged within each fold id, so tuned models
# (many grid points per fold) and the untuned logistic model are reduced to
# one number per fold alike.
collect_fold_auc <- function(cv_res) {
  cv_res %>%
    collect_metrics(summarize = FALSE) %>%
    group_by(id) %>%
    summarise(fold_auc = mean(.estimate))
}
cart_auc_by_fold <- collect_fold_auc(cart_cv)
logit_auc_by_fold <- collect_fold_auc(logistic_cv)
knn_auc_by_fold <- collect_fold_auc(flood_knn_cv)
# One row per fold/model pair for plotting and averaging.
roc_auc <- bind_cols(
  fold = cart_auc_by_fold$id,
  cart = cart_auc_by_fold$fold_auc,
  logistic = logit_auc_by_fold$fold_auc,
  knn = knn_auc_by_fold$fold_auc
) %>%
  pivot_longer(
    cols = 2:4,
    names_to = "model",
    values_to = "roc_auc"
  )
# Boxplot of fold-level AUC next to a table of the per-model means.
P7 <- ggplot(roc_auc) +
  geom_boxplot(aes(x = model, y = roc_auc))
T1 <- roc_auc %>%
  group_by(model) %>%
  summarise(roc_auc = mean(roc_auc))
P7 + gridExtra::tableGrob(T1)
# CART is the best model
# Lock the winning tree_depth into the workflow, refit on the full training
# set, and draw the fitted tree (the rpart object sits three `fit`s deep
# inside the fitted workflow).
flood_cart_best <- cart_cv %>%
select_best(metric = "roc_auc")
flood_final_wf <- cart_wf %>%
finalize_workflow(
parameters = flood_cart_best
)
flood_final_fit <- flood_final_wf %>%
fit(data = flood_train_df)
rpart.plot::rpart.plot(x = flood_final_fit$fit$fit$fit)
# Out-of-sample assessment: attach class and probability predictions to the
# spatial test set and label each tract's confusion-matrix cell.
# NOTE(review): bind_cols() relies on predict() returning exactly one row per
# row of flood_test_sf; if na.omit() while building flood_test_df ever drops
# rows, the two frames no longer align — verify the row counts match.
flood_assess <- bind_cols(
  flood_test_sf,
  predict(object = flood_final_fit, new_data = flood_test_df, type = "class"),
  predict(object = flood_final_fit, new_data = flood_test_df, type = "prob")
) %>%
  mutate(
    assessment = case_when(
      flood == 1 & .pred_class == 1 ~ "true positive",
      flood == 1 & .pred_class == 0 ~ "false negative",
      flood == 0 & .pred_class == 1 ~ "false positive",
      flood == 0 & .pred_class == 0 ~ "true negative"
    )
  )
# ROC curve on held-out data; .pred_0 matches yardstick's default
# event_level = "first" (factor level "0").
flood_assess %>%
  roc_curve(truth = flood, estimate = .pred_0) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 3) +
  coord_equal() +
  theme_minimal()
# Map of the confusion-matrix labels.
flood_assess %>%
  ggplot() +
  geom_sf(aes(fill = assessment), color = "white")+
  theme_void() +
  labs(
    # FIX: the finalized model is the tuned CART tree, not KNN.
    title = "Assessment of the Finalized CART Model",
    fill = ""
  )
# (1) Load demographic data 2012
credential <- Sys.getenv("census_api_key")
# Refresh the variable catalog for the 2012 vintage before re-querying.
lookup_var <- load_variables(2012, "acs5", cache = TRUE)
# 2012 5-year ACS for every tract in New York State (no county filter here,
# unlike the 2020 pull). NOTE(review): the poverty variable differs from the
# 2020 pull (C17002_001 vs B17020_002) — presumably a vintage difference;
# confirm the two measure the same concept.
demo2012 <- get_acs(
geography = "tract",
year = 2012,
variables = c(
race,
education_attainment,
employment_status,
sex_by_age,
hhincome = "B19013_001",
poverty = "C17002_001"),
state = 36,
key = credential
)
# (2) Tidy demographic data: drop the margin of error and tract name, keep a
# lower-case id, and spread the ACS variables into one column per measure.
demo2012_clean <- demo2012 %>%
  select(-c(moe, NAME)) %>%
  rename(geoid = GEOID) %>%
  pivot_wider(names_from = variable, values_from = estimate)
# (3) Mutate variables
# Same derived measures as the 2020 data, expressed as a single mutate()
# instead of repeated base-R `$` assignments.
demo2012_clean <- demo2012_clean %>%
  mutate(
    # Share of non-white residents; 0/0 yields NaN, reported as 0
    # (is.na() is TRUE for NaN).
    rate_of_minorities = round(
      (black + asian + indian_alaska + pacific) /
        (white + black + asian + indian_alaska + pacific) * 100, 1),
    rate_of_minorities = replace(rate_of_minorities, is.na(rate_of_minorities), 0),
    # Employed share of the labor force; NaN (0/0) -> 0. NOTE(review): the
    # original used is.nan() here but is.na() above — asymmetry preserved.
    employment_rate = round(employed / (employed + unemployed) * 100, 1),
    employment_rate = replace(employment_rate, is.nan(employment_rate), 0)
  )
# (4) Convert into sf object
# NOTE: As Sandy happened in 2012, we use 2010 geographic data instead of 2020 one.
# Statewide 2010 cartographic-boundary tracts, reprojected to WGS84.
geo2010 <- tracts(
state = 36,
cb = TRUE,
year = 2010
) %>%
st_transform(crs = 4326)
##
|
| | 0%
|
|== | 2%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|======== | 11%
|
|========= | 12%
|
|========= | 13%
|
|========== | 15%
|
|============ | 16%
|
|============ | 17%
|
|============= | 19%
|
|============== | 21%
|
|=============== | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 26%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 31%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 45%
|
|================================= | 47%
|
|================================== | 48%
|
|=================================== | 50%
|
|=================================== | 51%
|
|===================================== | 52%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 56%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 62%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|====================================================== | 77%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|========================================================= | 81%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|======================================================================| 99%
|
|======================================================================| 100%
# Keep only identifiers and geometry. The 2010 CB files carry a GEO_ID like
# "1400000US36005000100"; characters 10-20 are the 11-digit tract GEOID
# (assumes that prefix layout — confirm for other vintages).
geo2010 <- geo2010 %>%
select(-STATE, -NAME, -LSAD, -CENSUSAREA, -COUNTYFP, -STATEFP) %>%
clean_names() %>%
mutate(geoid = substr(geo_id, 10, 20)) %>%
select(-geo_id)
# Attach 2010 tract geometry to the 2012 ACS data.
# FIX: make the join key explicit instead of relying on the natural-join
# message from left_join().
demo2012_sf <- left_join(
  x = demo2012_clean,
  y = geo2010,
  by = "geoid"
) %>%
  st_as_sf() %>%
  st_transform(crs = 4326)
# Flooding damage
# Sandy damage claims come at block-group level; truncating the 12-digit
# block-group GEOID to 11 digits gives the tract, and damages are summed
# per tract.
sandy_damage <- read_csv("data/Sandy_Damage_Estimates_by_Block_Group.csv") %>%
  select(GEOID, total_dmg) %>%
  clean_names() %>%
  filter(str_detect(geoid, "^36")) %>%  # New York State only
  mutate(geoid = substr(geoid, 1, 11)) %>%
  group_by(geoid) %>%
  # FIX: summarise() is the direct idiom for "sum per group, one row per
  # group"; the original mutate(sum) + unique() round-trip is equivalent
  # but obscures the intent.
  summarise(total_dmg = sum(total_dmg))
# Sandy high water mark (hwm)
# USGS high-water-mark points filtered to New York, as WGS84 point features.
sandy_hwm_p <- read_csv("data/FilteredHWMs.csv") %>%
  clean_names() %>%
  select(latitude, longitude, state_name, elev_ft) %>%
  filter(state_name == "NY") %>%
  select(-state_name) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)
# Planar geometry for the containment join (s2 disabled as above).
sf_use_s2(FALSE)
# Tracts that contain at least one high-water mark; one row per tract/mark pair.
sandy_hwm_ct <- st_join(geo2010, sandy_hwm_p, join = st_contains) %>%
  filter(!is.na(elev_ft))
# FIX: make the join key explicit (was a natural join on the shared geoid).
sandy_dmg_hwm <- inner_join(sandy_hwm_ct, sandy_damage, by = "geoid")
# Join tract-level demographics onto the damage/HWM tracts by identical
# geometry (st_equals); drop the duplicated id columns from the y side and
# restore clean names before removing incomplete rows.
sandy_modeling_data <- st_join(sandy_dmg_hwm, demo2012_sf, join = st_equals) %>%
select(-county.y, -tract.y, -geoid.y) %>%
rename(county = county.x, tract = tract.x, geoid = geoid.x) %>%
na.omit()
# 75/25 train/test split for the damage regression; sf versions are kept for
# mapping, tibble versions (ids and geometry dropped) for modeling.
set.seed(20220422)
sandy_split <- initial_split(sandy_modeling_data, prop = 0.75)
sandy_train_sf <- training(sandy_split)
sandy_test_sf <- testing(sandy_split)
sandy_train_df <- sandy_train_sf %>%
# st_centroid() %>%
# mutate(lat = unlist(map(geometry, 1)), lon = unlist(map(geometry, 2))) %>%
# na.omit() %>%
as_tibble() %>%
select(-c(county, tract, geoid, geometry))
sandy_test_df <- sandy_test_sf %>%
# st_centroid() %>%
# mutate(lat = unlist(map(geometry, 1)), lon = unlist(map(geometry, 2))) %>%
# na.omit() %>%
as_tibble() %>%
select(-c(county, tract, geoid, geometry))
# Choropleths of the training tracts: total Sandy damage (P8) above
# high-water elevation (P9), stacked with patchwork's `/` operator.
P8 <- ggplot(sandy_train_sf) +
  geom_sf(aes(fill = total_dmg), color = "white") +
  scale_fill_gradient(low = "yellow", high = "red")
P9 <- ggplot(sandy_train_sf) +
  geom_sf(aes(fill = elev_ft), color = "white") +
  scale_fill_gradient(low = "yellow", high = "red")
P8 / P9
# The elevation of water and the geometric information explains part of the total damage caused by Sandy hurricane.
# EDA helper: scatter of one demographic variable against total damage,
# colored/shaped by the elevation quartile of the Sandy high-water marks.
#
# demo_var: column name (string) in sandy_train_sf to put on the x-axis.
# Side effect: assigns the plot to P_<demo_var> in the global environment
# (kept for compatibility with the exploratory workflow); the plot is also
# returned invisibly by assign(), so map()-based callers still work.
sandy_EDA <- function(demo_var){
  # FIX: derive the quartile breakpoints from the data instead of
  # hard-coding 6.6 / 8.95 / 10.775 (the training-set quartiles at the time
  # of writing), so the helper stays correct if the split or data changes.
  q <- quantile(sandy_train_sf$elev_ft, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
  P <- sandy_train_sf %>%
    mutate(
      elevation = case_when(
        elev_ft < q[[1]] ~ "Min-Q1",
        elev_ft < q[[2]] ~ "Q1-Median",
        elev_ft < q[[3]] ~ "Median-Q3",
        TRUE ~ "Q3-Max"
      )
    ) %>%
    rename("estimate" = demo_var) %>%
    ggplot() +
    geom_point(
      aes(estimate, total_dmg, color = elevation, shape = elevation),
      size = 3,
      alpha = 0.7
    ) +
    labs(
      x = demo_var,
      y = "total claim of damage"
    )
  assign(paste("P", demo_var, sep = "_"), P, envir = .GlobalEnv)
}
#wrap_plots(
# map(.x = c("rate_of_minorities", "bachelor", "hhincome", "employment_rate"), .f = sandy_EDA)
#)
#Holding the elevation of water at the constant level, the number of white in one census tract positively correlates to the total claims of damage.
# 10-fold CV shared by all three regression models; recipe centers and
# scales the numeric predictors of total damage.
sandy_folds <- vfold_cv(sandy_train_df, v = 10, repeats = 1)
sandy_rec <- recipe(total_dmg ~ ., data = sandy_train_df) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors())
# Random Forest Model
sandy_rf_mod <- rand_forest(mtry = tune(), trees = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")
# mtry's upper bound depends on the number of predictors, so the parameter
# set is finalized against the training frame before sampling the grid.
# FIX: parameters() on model specs is deprecated in favour of
# extract_parameter_set_dials().
rf_grid <- sandy_rf_mod %>%
  extract_parameter_set_dials() %>%
  finalize(sandy_train_df) %>%
  grid_latin_hypercube(size = 10)
sandy_rf_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_rf_mod)
# FIX: spell out `resamples` (was `resample`, working only through partial
# argument matching).
sandy_rf_cv <- sandy_rf_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = rf_grid,
    metrics = metric_set(rmse)
  )
## Elastic Net Model
# Penalty tuned over a regular grid; mixture fixed at 0.5 (even lasso/ridge
# blend).
sandy_en_mod <- linear_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet") %>%
  set_mode("regression")
en_grid <- grid_regular(penalty(), levels = 10)
sandy_en_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_en_mod)
# FIX: the tune_grid() argument is `resamples`; `resample` only worked via
# partial argument matching.
sandy_en_cv <- sandy_en_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = en_grid,
    metrics = metric_set(rmse)
  )
## KNN Model
# kknn regressor; neighbor count tuned over a 10-level regular grid.
sandy_knn_mod <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("regression")
knn_grid <- grid_regular(neighbors(), levels = 10)
sandy_knn_wf <- workflow() %>%
  add_recipe(sandy_rec) %>%
  add_model(sandy_knn_mod)
# FIX: spell out `resamples` (was `resample`, relying on partial matching).
sandy_knn_cv <- sandy_knn_wf %>%
  tune_grid(
    resamples = sandy_folds,
    grid = knn_grid,
    metrics = metric_set(rmse)
  )
# Per-fold RMSE for the three Sandy models, reduced the same way as the
# classification AUCs: average the unsummarized metric within each fold id.
collect_fold_rmse <- function(cv_res) {
  cv_res %>%
    collect_metrics(summarize = FALSE) %>%
    group_by(id) %>%
    summarise(fold_rmse = mean(.estimate))
}
rf_rmse_by_fold <- collect_fold_rmse(sandy_rf_cv)
en_rmse_by_fold <- collect_fold_rmse(sandy_en_cv)
knn_rmse_by_fold <- collect_fold_rmse(sandy_knn_cv)
# One row per fold/model pair.
sandy_rmse <- bind_cols(
  fold = rf_rmse_by_fold$id,
  sandy_rf = rf_rmse_by_fold$fold_rmse,
  sandy_en = en_rmse_by_fold$fold_rmse,
  sandy_knn = knn_rmse_by_fold$fold_rmse
) %>%
  pivot_longer(
    cols = 2:4,
    names_to = "model",
    values_to = "rmse"
  )
# Boxplot of fold-level RMSE next to a table of the per-model means.
P10 <- ggplot(sandy_rmse) +
  geom_boxplot(aes(x = model, y = rmse))
T2 <- sandy_rmse %>%
  group_by(model) %>%
  summarise(rmse = mean(rmse))
P10 + gridExtra::tableGrob(T2)
## Random Forest is the best model among the three.
# Finalize the workflow with the best RF hyperparameters and refit on the
# full training frame.
sandy_rf_best <- sandy_rf_cv %>%
  select_best(metric = "rmse")
sandy_final_wf <- sandy_rf_wf %>%
  finalize_workflow(
    parameters = sandy_rf_best
  )
sandy_final_fit <- sandy_final_wf %>%
  fit(data = sandy_train_df)
# FIX: pass the test frame directly — sandy_test_df is already a tibble, and
# wrapping a data frame in tibble() creates a packed one-column tibble in
# current tibble versions.
sandy_prediction <- sandy_final_fit %>%
  predict(new_data = sandy_test_df)
# Spatial test set with the predicted damage attached column-wise.
sandy_assess <- bind_cols(
  sandy_test_sf,
  predict_dmg = sandy_prediction$.pred
)
# Held-out RMSE; as_tibble() drops the sf class so yardstick sees a plain
# data frame.
as_tibble(sandy_assess) %>%
  rmse(truth = total_dmg, estimate = predict_dmg)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 464.
# Density overlay of observed vs. predicted damage on the test set.
# FIX: as_tibble() (tibble() on a data frame packs it into one column in
# current tibble versions) and named pivot columns (positional 1:2 is
# fragile if the column order ever changes).
as_tibble(sandy_assess) %>%
  select(total_dmg, predict_dmg) %>%
  pivot_longer(
    cols = c(total_dmg, predict_dmg),
    names_to = "true_prediction",
    values_to = "dmg"
  ) %>%
  ggplot() +
  geom_density(aes(x = dmg, group = true_prediction, fill = true_prediction, color = true_prediction), alpha = 0.5) +
  labs(x = "damage")
# Map of test-set residuals (observed minus predicted damage).
# NOTE(review): `low` and `high` are both "coral1", so over- and
# under-prediction get the same hue; and limits = c(-100, 100) renders any
# residual outside that band as NA (grey) — widen the limits or add an
# out-of-bounds squish if that clipping is not intended.
sandy_assess %>%
mutate(residual = total_dmg - predict_dmg) %>%
ggplot() +
geom_sf(aes(fill = residual), color = "white") +
scale_fill_gradient2(
low = "coral1",
mid = "turquoise3",
high = "coral1",
limits = c(-100,100),
guide = guide_colourbar(barheight = unit(4, "cm"))
) +
theme_void()